library(tidyverse)
library(ggplot2)
library(ggpubr)
library(plotly)
library(data.table)
library(rstatix)
library(countrycode)
data <- data.table(read.csv("gapminder_clean.csv")) # used read.csv to leverage ggpubr plotting
data <- data %>% mutate(continent = countrycode(
sourcevar = Country.Name,
origin = "country.name",
destination = "continent"
))
data[is.na(continent), continent := "Unknown"]
data %>%
filter(Year == 1962) %>%
ggscatter(
x = grep("emissions", colnames(data), value = TRUE),
y = "gdpPercap",
add = "reg.line",
cor.coef = TRUE
) +
labs(
x = "CO2 emissions (metric tons per capita)",
y = "GDP per Capita"
)
maxCorrYear <- data %>%
group_by(Year) %>%
mutate(
correlation = cor(
CO2.emissions..metric.tons.per.capita.,
gdpPercap,
use = "complete.obs"
)
) %>%
ungroup() %>%
filter(correlation == max(correlation, na.rm = TRUE))
p <- maxCorrYear %>%
ggscatter(
x = grep("emissions", colnames(data), value = TRUE),
y = "gdpPercap",
size = "pop",
color = "continent"
) +
labs(
x = "CO2 emissions (metric tons per capita)",
y = "GDP per Capita"
)
ggplotly(p)
country_energy_use <- data %>%
group_by(Country.Name) %>%
reframe(
mean_energy_use = na.omit(
mean(
Energy.use..kg.of.oil.equivalent.per.capita.,
na.rm = TRUE
)
)
)
energy_use <- merge(
country_energy_use,
data[!duplicated(Country.Name), c("Country.Name", "continent"), with = FALSE]
)
normality <- energy_use %>%
group_by(continent) %>%
summarise(
statistic = shapiro.test(mean_energy_use)$statistic,
p_value = shapiro.test(mean_energy_use)$p.value
)
normality_tbl <- normality %>%
ggtexttable(
rows = NULL,
theme = ttheme("blank")
) %>%
tab_add_hline(at.row = c(1, 2), row.side = "top", linewidth = 2) %>%
tab_add_hline(at.row = nrow(normality) + 1, row.side = "bottom", linewidth = 2)
normality_tbl
Result: The energy usage within each continent deviates significantly from a normal distribution
variance <- oneway.test(mean_energy_use ~ continent, data = energy_use, var.equal = FALSE)
variance
One-way analysis of means (not assuming equal variances)
data: mean_energy_use and continent
F = 16.971, num df = 5.000, denom df = 56.683, p-value = 3.169e-10
Result: The small p-value indicates that the variances in energy usage across each continent are significantly different. Thus, we have unequal variances
pairwise_result <- pairwise_wilcox_test(
data = energy_use,
formula = mean_energy_use ~ continent,
p.adjust.method = "none"
) %>%
rename("p.signif" = "p.adj.signif") %>%
select(group1, group2, p.signif)
tbl <- pairwise_result %>%
ggtexttable(
rows = NULL,
theme = ttheme("blank")
) %>%
tab_add_hline(at.row = c(1, 2), row.side = "top", linewidth = 2) %>%
tab_add_hline(at.row = nrow(pairwise_result) + 1, row.side = "bottom", linewidth = 2)
x_axis_order <- energy_use %>%
group_by(continent) %>%
summarize(
median = median(mean_energy_use, na.rm = TRUE)
) %>%
arrange(median) %>%
pull(continent)
p <- energy_use %>%
ggboxplot(
x = "continent",
y = "mean_energy_use",
color = "continent",
add = "jitter",
order = x_axis_order,
outlier.shape = NA,
) +
labs(
title = "Energy Use (kg oil equivalent per capita) by Continent",
x = NULL,
y = "Mean energy use (kg of oil equivalent per capita)"
)
p <- ggpar(p, legend = "none")
ggarrange(p, tbl, ncol = 2, nrow = 1)
The figure above indicates that Europe has the highest per capita energy
usage with Africa and Oceania tied for the lowest per capita energy
usage. The energy usage of the Americas, Asia, and entities not
associated with a continent fall somewhere in between
country_percent_imports <- data %>%
filter(continent %in% c("Europe", "Asia") & Year >= 1990) %>%
group_by(Country.Name) %>%
reframe(
mean_import_perc = na.omit(
mean(
Imports.of.goods.and.services....of.GDP.,
na.rm = TRUE
)
)
)
percent_imports <- merge(
country_percent_imports,
data[!duplicated(Country.Name), c("Country.Name", "continent"), with = FALSE]
)
percent_imports %>%
ggboxplot(
x = "continent",
y = "mean_import_perc",
add = "jitter",
xlab = FALSE,
ylab = "Mean import percentage of goods and services",
outlier.shape = NA
) +
stat_compare_means()
As you can see from the Wilcoxon p-value, there is no significant difference between Europe and Asia with respect to the mean import percentage in the years after 1990
country_highest_pop <- data %>%
group_by(Country.Name) %>%
summarize(
mean = mean(pop, na.rm = TRUE)
) %>%
filter(mean == max(mean, na.rm = TRUE)) %>%
pull(Country.Name)
country_highest_pop
[1] "China"
country_highest_life <- data %>%
filter(Year >= 1962 & Year <= 2007) %>%
select(Country.Name, Year, Life.expectancy.at.birth..total..years.) %>%
spread(Year, Life.expectancy.at.birth..total..years.) %>%
mutate(life_expectancy_increase = `2007` - `1962`) %>%
filter(life_expectancy_increase == max(life_expectancy_increase, na.rm = TRUE)) %>%
pull(Country.Name)
country_highest_life
[1] "Maldives"